Transcriptome Profile

library(rmarkdown)
library(rmdformats)
library(dplyr)
library(readr)
library(stringr)
library(tidyr)
library(tibble)
library(magrittr)
library(ggplot2)
library(edgeR)
library(viridisLite)
library(pheatmap)
library(VennDiagram)
library(grid)
library(gridExtra)
library(DT)
library(here)

Import annotations

Load in the ensembl gene annotations here to be used throughout the DTE analysis. This will connect to an ensembl annotation database in which the ensembl version 99 metadata will be stored as an object, giving information on the transcript ID, length, gc content, biotype and other information. This step may take some time as the data can be somewhat large.

library(AnnotationHub)
library(ensembldb)

ah <- AnnotationHub()

ensDb <- ah[["AH78783"]]
genesGR <- genes(ensDb)
transGR <- transcripts(ensDb)
transGR$gene_name <- mcols(genesGR)[mcols(transGR)$gene_id,"gene_name"]
transGR$length <- lengthOf(ensDb, "tx")

ensDb <- ah[["AH10684"]]
transcript_names <- ensDb %>%
  as.data.frame() %>%
  dplyr::filter(type == "transcript") %>%
  dplyr::select("transcript_id",
                "transcript_name")

grch38_txp_anno <- transGR %>%
  as.data.frame() %>%
  dplyr::select("gene_id",
                "gene_name",
                "transcript_id" = "tx_id",
                "width",
                "length",
                "chromosome" = "seqnames",
                "transcript_biotype" = "tx_biotype") %>%
  left_join(transcript_names,
            by = "transcript_id") %>%
  as_tibble()

write_csv(grch38_txp_anno, here("data/grch38_tx_anno.csv.gz"))

grch38_gene_anno <- genesGR %>%
  as.data.frame() %>%
  dplyr::select("gene_id",
                "gene_name",
                "length" = "width",
                "chromosome" = "seqnames",
                "gene_biotype") %>%
  as_tibble()

write_csv(grch38_gene_anno, here("data/grch38_gene_anno.csv.gz"))

ensDb <- ah[["AH10684"]]
transcript_names <- ensDb %>%
  as.data.frame() %>%
  dplyr::filter(type == "transcript") %>%
  dplyr::select("transcript_id",
                "transcript_name")

write_csv(transcript_names, here("data/grch38_transcript_names.csv.gz"))

Read in annotation data

Just read it in here because its quicker

grch38_txp_anno <- read_csv(here("data/grch38_tx_anno.csv.gz"))
grch38_gene_anno <- read_csv(here("data/grch38_gene_anno.csv.gz"))
transcript_names <- read_csv(here("data/grch38_transcript_names.csv.gz"))

Load data

Load the results from each analysis

oxygenGenes <- read_csv(here("data/oxygenGenes.csv.gz"))
oxygenTranscripts <- read_csv(here("data/oxygenTranscripts.csv.gz"))
oxygenDTU <- read_csv(here("data/oxygenDTU.csv.gz"))

dtelist <- read_rds(here("data/dtelist.rds"))

impure_samples <- c(
  "PAC006", "PAC008", "PAC024", "PAC025",
  "PAC034", "PAC035", "PAC039", "PAC041", 
  "PAC036", "PAC045", "PAC071", "PAC131"
  )

pd <- here("data/metadata.csv") %>%
  read_csv() %>%
  dplyr::filter(Cohort == "PAC") %>%
  mutate(Trimester = as.factor(Trimester),
         Timepoint = ifelse(GestationalAge <= 10, "6-10weeks", "11-23weeks")) %>%
  dplyr::filter(!ID %in% impure_samples) %>%
  arrange(ID)

# metadata needs to be in this structure
age_order <- dplyr::arrange(pd, as.numeric(GestationalAge)) %>%
  dplyr::select(ID, GestationalAge)

Differences Between Methods

I’m interested in finding how DGE, DTE, and DTU differ in their results. This isn’t so much a technical goal, but a biological one. Primarily, I am looking for individual transcripts that are either differentially expressed or undergoing changes in proportion while no significant gene expression is seen for their parent gene. This goes for genes undergoing DTU as well. This would mean that changes in transcript expression remain undetected in gene-level analysis. Now I’m not here to talk trash about gene-level analyses, which is why I am equally as interested in the identification of significant genes with no significant transcripts that are differentially expressed or in different proportions. In this case, DGE analysis serves a wonderful purpose in the identification of potentially biologically significant changes in a gene’s output that can not be detected in DTE.

First we need to create subsets of the genes and transcripts which are considered to be differentially expressed. With these subsetted lists we can overlap and record just how similar each one is. For the numbers identified in the venn diagram, we are using only genes, this is because comparing numbers of genes in DGE, and numbers of transcripts in DTE and DTU would give results showing more statistically significant features identified in DTE and DTU, when that is just because there are more transcripts than genes. It would be very misleading. Also we simply would fail to overlap the results. Basically the question here is: what genes do I identify in each analysis and where do these overlap?

sig_genes <- oxygenGenes %>%
  dplyr::filter(FDR < 0.05 & abs(logFC) > 1)

sig_tx <- oxygenTranscripts %>%
  dplyr::filter(FDR < 0.05 & abs(logFC) > 1)

sig_dtu <- oxygenDTU %>%
  dplyr::filter(geneFDR < 0.05)

Prepare objects to call in venn diagram, each object is essentially a list of genes which is featured in each different list. These recorded lists will be overlapped and the number of genes which feature in each of the different lists. Here, take the genes that transcribe significant transcripts in both the differential transcript expression and differential transcript usage results.

DTE_DTU <- sig_dtu %>%
  dplyr::filter(Gene %in% sig_tx$Gene) %>%
  dplyr::select(Gene) %>%
  distinct() %>%
  as.matrix() %>%
  length()

sig_dtu %>%
  dplyr::filter(Gene %in% sig_tx$Gene,
                !Gene %in% sig_genes$Gene) %>%
  dplyr::select(Gene) %>%
  distinct() %>%
  as.matrix() %>%
  length() %>%
  print()
## [1] 50

Also identify here the transcripts that are differentially expressed but also are have a significant change in proportion. There are four genes that are significant in both DTE and DTU. This is because each gene has at least one transcript that has significantly changing proportions that is also differentially expressed. However, they also have another transcript with significant proportion changes but no significant differential expression.

sig_dtu %>%
  dplyr::filter(Transcript %in% sig_tx$Transcript,
                !Gene %in% sig_genes$Gene) %>%
  dplyr::select(Gene) %>%
  distinct() %>%
  as.matrix() %>%
  length() %>%
  print()
## [1] 47
dte_dtu_gene <- sig_dtu %>%
  dplyr::filter(Gene %in% sig_tx$Gene,
                !Gene %in% sig_genes$Gene) %>%
  dplyr::select(Gene) %>%
  distinct()

dte_dtu_tx <- sig_dtu %>%
  dplyr::filter(Transcript %in% sig_tx$Transcript,
                !Gene %in% sig_genes$Gene) %>%
  dplyr::select(Gene) %>%
  distinct()

extra_genes <- dte_dtu_gene %>%
  dplyr::filter(!Gene %in% dte_dtu_tx$Gene)

sig_dtu %>%
  dplyr::filter(Gene %in% sig_tx$Gene,
                !Gene %in% sig_genes$Gene) %>%
  dplyr::filter(Gene %in% extra_genes$Gene) %>%
  dplyr::select("Gene",
                "Gene Name" = "gene_name",
                "Transcript",
                "Transcript Name" = "transcript_name",
                "DTU gene FDR" = "geneFDR",
                "DTU transcript FDR" = "txpFDR") %>%
  left_join(oxygenTranscripts %>%
              dplyr::select("Transcript", "DTE FDR" = "FDR"),
            by = "Transcript") %>%
  datatable()

Now check for genes that are DGE and DTU but not found in DTE. Basically just doing a full check here. What are the differences?

DGE_DTU <- sig_dtu %>%
  dplyr::filter(Gene %in% sig_genes$Gene) %>%
  dplyr::select(Gene) %>%
  distinct() %>%
  as.matrix() %>%
  length()

sig_dtu %>%
  dplyr::filter(Gene %in% sig_genes$Gene,
                !Gene %in% sig_tx$Gene) %>%
  nrow() %>%
  print()
## [1] 0

Now identify genes that overlap across all methods.

DGE_DTE_DTU <- sig_dtu %>%
  dplyr::filter(Gene %in% sig_tx$Gene,
                Gene %in% sig_genes$Gene) %>%
  dplyr::select(Gene) %>%
  distinct() %>%
  as.matrix() %>%
  length()

sig_dtu %>%
  dplyr::filter(Gene %in% sig_tx$Gene,
                Gene %in% sig_genes$Gene) %>%
  dplyr::select(Gene) %>%
  distinct() %>%
  as.matrix() %>%
  length() %>%
  print()
## [1] 27

Now lets grab genes in both DTE and DGE, but not in DTU

DTE_DGE <- sig_genes %>%
  dplyr::filter(Gene %in% sig_tx$Gene) %>%
  dplyr::select(Gene) %>%
  distinct() %>%
  as.matrix() %>%
  length()

sig_genes %>%
  dplyr::filter(Gene %in% sig_tx$Gene) %>%
  dplyr::select(Gene) %>%
  distinct() %>%
  as.matrix() %>%
  length() %>%
  print()
## [1] 634

Now we get the total numbers for the area parameters, so we know how many genes were identified in each.

DGE_number <- sig_genes$Gene %>%
  unique() %>%
  length()

DTE_number <- sig_tx$Gene %>%
  unique() %>%
  length()

DTU_number <- sig_dtu$Gene %>%
  unique() %>%
  length()

The objects created in the previous code chunck are used here to plot the triple venn diagram which records the level of overlap between each of the different analyses run.

grid.newpage()

v <- draw.triple.venn(
  area1 = DGE_number,
  area2 = DTE_number,
  area3 = DTU_number,
  n12 = DTE_DGE,
  n13 = DGE_DTU,
  n23 = DTE_DTU,
  n123 = DGE_DTE_DTU,
  category = c(
    "DGE",
    "DTE",
    "DTU"),
  fill = c(
    "blue", 
    "red",
    "green"),
  lty = "blank",
  cex = 2,
  fontfamily = "sans",
  cat.fontfamily = "sans",
  cat.pos = c(200, 160, 360),
  cat.dist = c(0.05, 0.06, 0.075),
  cat.cex = 2,
  scaled = FALSE
)

Can add a title if you want, but not necessary for this R markdown

grid.arrange(
  gTree(children = v),
  top = textGrob("Overlap of significant genes from DGE, DTE, and DTU analysis",
                 gp = gpar(fontsize = 16,
                           font = 2),
                 vjust = 1.5)
)

Greatest Differences Between Gene and Transcript Expression

In this section I compare the DGE and DTE results in order to find which transcripts are most genuinely masked by aggregation to the gene level. Sometimes transcripts can be masked by expression of a more “dominant” transcripts with multiple folds more expression than it, or a sister transcript that has an similar but opposite change in expression that neutralises any effect on the gene level, or if it has many other transcripts with similar changes in expression that are collapsed into a much seemingly smaller change on the gene level. Here we can take a look at whats happening in the data.

DET_DEG_logFC_diff <- oxygenTranscripts %>%
  dplyr::select("Transcript",
                "Gene",
                "GeneName",
                "logCPM",
                "logFC",
                "FDR") %>%
  left_join(oxygenGenes %>%
              dplyr::select(
                "Gene",
                "gene_logFC" = "logFC",
                "geneFDR" = "FDR"
              ),
            by = "Gene") %>%
  left_join(transcript_names,
            by = c("Transcript" = "transcript_id")) %>%
  mutate(logFC_diff = (abs(logFC) - abs(gene_logFC))) %>%
  arrange(desc(logFC_diff))

DET_DEG_logFC_diff %>% dplyr::filter(logFC_diff > 1) %>%
  dplyr::filter(geneFDR > 0.05 & abs(gene_logFC) < 1) %>%
  dplyr::select(
    "Transcript ID" = "Transcript",
    "Transcript Name" = "transcript_name",
    "Gene Name" = "GeneName",
    "txp logFC" = "logFC",
    "logFC Diff" = "logFC_diff",
    "txp FDR" = "FDR",
    "gene FDR" = "geneFDR"
  )
## # A tibble: 15 × 7
##    `Transcript ID` `Transcript Name` `Gene Name` `txp logFC` `logFC Diff`
##    <chr>           <chr>             <chr>             <dbl>        <dbl>
##  1 ENST00000593948 PSG9-008          PSG9              -2.76         2.35
##  2 ENST00000397806 HBA2-002          HBA2              -1.99         1.99
##  3 ENST00000583025 SLC16A3-018       SLC16A3           -2.51         1.75
##  4 ENST00000558733 ADAM10-015        ADAM10            -2.13         1.75
##  5 ENST00000407568 PSG5-005          PSG5              -1.87         1.68
##  6 ENST00000381318 ITSN1-001         ITSN1              1.73         1.26
##  7 ENST00000372360 RPS24-001         RPS24             -1.45         1.25
##  8 ENST00000517581 AZIN1-012         AZIN1             -1.31         1.22
##  9 ENST00000592790 VMP1-004          VMP1               1.43         1.22
## 10 ENST00000282397 FLT1-001          FLT1               1.19         1.18
## 11 ENST00000553964 CALM1-005         CALM1              1.24         1.17
## 12 ENST00000427865 MGAT1-004         MGAT1              1.67         1.17
## 13 ENST00000297488 MTUS1-005         MTUS1              1.41         1.14
## 14 ENST00000461482 TFPI2-002         TFPI2              1.37         1.03
## 15 ENST00000528031 GDPD5-012         GDPD5              1.14         1.02
## # ℹ 2 more variables: `txp FDR` <dbl>, `gene FDR` <dbl>
significance_diff <- DET_DEG_logFC_diff %>% dplyr::filter(geneFDR > 0.05)

DET_DEG_logFC_diff %>%
  dplyr::filter(is.na(geneFDR)) %>%
  dplyr::select(
    "Transcript ID" = "Transcript",
    "Transcript Name" = "transcript_name",
    "Gene Name" = "GeneName",
    "logFC",
    "FDR"
  )
## # A tibble: 129 × 5
##    `Transcript ID` `Transcript Name` `Gene Name` logFC           FDR
##    <chr>           <chr>             <chr>       <dbl>         <dbl>
##  1 ENST00000356897 IL27-001          IL27        -1.75 0.00000000221
##  2 ENST00000370565 CLCA2-001         CLCA2        2.82 0.00000000528
##  3 ENST00000343529 RELN-002          RELN        -2.62 0.0000000782 
##  4 ENST00000289746 CDH15-001         CDH15        2.13 0.000000176  
##  5 ENST00000297439 DEFB1-001         DEFB1       -2.43 0.000000918  
##  6 ENST00000340634 PAQR9-001         PAQR9       -1.40 0.00000532   
##  7 ENST00000356495 PIGR-001          PIGR        -2.17 0.0000133    
##  8 ENST00000262178 VIPR2-001         VIPR2        2.04 0.0000144    
##  9 ENST00000341948 PCDHB13-001       PCDHB13      1.56 0.0000200    
## 10 ENST00000310954 SLCO4C1-001       SLCO4C1     -1.73 0.0000310    
## # ℹ 119 more rows
DT::datatable(DET_DEG_logFC_diff)
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

We should check to make sure the logFCs are not in the opposite direction between our gene and transcript counts. If differences in logFC cancel out then these results would be telling us the wrong thing

sig_tx %>%
  dplyr::select(
    "Gene_txp" = "Gene",
    "logFC_txp" = "logFC") %>%
  left_join(sig_genes %>%
              dplyr::select(
                "Gene_gene" = "Gene",
                "logFC_gene" = "logFC"
              ),
            by = c("Gene_txp" = "Gene_gene")) %>%
  ggplot(
    aes(logFC_txp,
        logFC_gene)
    ) +
  geom_point() +
  xlab("logFC transcripts") +
  ylab("logFC genes") +
  theme_bw() +
  theme(axis.title = element_text(size = 20),
        axis.text = element_text(size = 20))
## Warning: Removed 252 rows containing missing values or values outside the scale range
## (`geom_point()`).

Compare FDR and logFC between genes and transcripts

Lets see if there are any major differences between the FDR values for genes and transcripts, then extend this check to logFC

gene_tx_diff <- DET_DEG_logFC_diff %>%
  dplyr::filter(logFC_diff > 1) %>%
  dplyr::filter(geneFDR > 0.05 & abs(gene_logFC) < 1) %>%
  dplyr::select(
    "Transcript",
    "TranscriptName" = "transcript_name",
    "Gene",
    "GeneName",
    "txp_logFC" = "logFC",
    "gene_logFC",
    "logFC_diff",
    "txpFDR" = "FDR",
    "geneFDR"
  ) 

DT::datatable(gene_tx_diff)

Scale CPM for heatmap

Now we will plot out these differences. We will use a heatmap that can be made with the pheatmap package. I prefer this over manually making one with ggplot2 as we can make nice annotation bars without needing to call grid or cowplot We just want to make sure we have defined the order of our clusters in clusterOrder for visualisation purposes and appropriately scale our values to get the contrasting colours in the plot. Then lock everything up as a matrix

## Scale by rows
# In this function we are calculating the mean and standard deviation for every row of the
# matrix. Then we subtract the mean from the raw value and divide by the calculated standard
# deviation to generate our scaled values. Dividing by the standard deviation means that

scale_rows <- function(x){
  m = apply(x, 1, mean, na.rm = T)
  s = apply(x, 1, sd, na.rm = T)
  return((x - m) / s)
}

clusterOrder <- dtelist %>%
  cpm() %>%
  as.data.frame() %>%
  rownames_to_column(var = "Transcript") %>%
  dplyr::filter(Transcript %in% gene_tx_diff$Transcript) %>%
  dplyr::arrange(Transcript) %>%
  set_rownames(gene_tx_diff %>%
                 dplyr::filter(!Transcript == "ENST00000372728") %>%
                 dplyr::arrange(Transcript) %>%
                 dplyr::select(Transcript) %>%
                 as.matrix() %>%
                 as.character()) %>%
  dplyr::select(-Transcript) %>%
  as.matrix() %>%
  dist() %>%
  hclust() %>%
  as.dendrogram(method = "ward.D2") %>%
  order.dendrogram()

scaled_cpm <- dtelist %>%
  cpm(log = TRUE) %>%
  as.data.frame() %>%
  rownames_to_column(var = "Transcript") %>%
  dplyr::filter(Transcript %in% gene_tx_diff$Transcript) %>%
  dplyr::arrange(Transcript) %>%
  set_rownames(gene_tx_diff %>%
                 dplyr::filter(!Transcript == "ENST00000372728") %>%
                 dplyr::arrange(Transcript) %>%
                 dplyr::select(Transcript) %>%
                 as.matrix() %>%
                 as.character()) %>%
  dplyr::select(-Transcript) %>%
  as.matrix()

Plot pheatmap

With everything ready lets make this heatmap. We also want to define out annotation column. This should have rownames that correspond to the sample IDs in our matrix. This way, pheatmap will automatically know how to visualise the annotations

clusteredMatrix <- scaled_cpm[clusterOrder, ]
clusteredMatrix <- clusteredMatrix[, (pd %>% 
                                        dplyr::filter(
                                          ID %in% colnames(clusteredMatrix)
                                        ) %>%
                                        dplyr::arrange(GestationalAge) %>%
                                        dplyr::select(ID) %>%
                                        as.matrix() %>%
                                        as.character)]

clusteredMatrix <- clusteredMatrix %>%
  as.data.frame() %>%
  rownames_to_column(var = "transcript_id") %>%
  left_join(transcript_names,
            by = "transcript_id") %>%
  dplyr::select(-transcript_id) %>%
  column_to_rownames("transcript_name") %>%
  as.matrix()

ann_col <- pd %>%
  dplyr::filter(ID %in% colnames(clusteredMatrix)) %>%
  dplyr::select(ID, "Gestation Group " = "Timepoint") %>%
  as.data.frame() %>%
  column_to_rownames("ID")

gene_row_order <- transcript_names %>%
  dplyr::filter(transcript_name %in% rownames(clusteredMatrix)) %>%
  left_join(oxygenTranscripts,
            by = c("transcript_id" = "Transcript")) %>%
  dplyr::select("Gene",
                "GeneName",
                "Transcript" = "transcript_id",
                "TranscriptName" = "transcript_name") %>%
  column_to_rownames("TranscriptName")

gene_row_order[clusterOrder, ]
##                        Gene GeneName      Transcript
## MGAT1-004   ENSG00000131446    MGAT1 ENST00000427865
## CALM1-005   ENSG00000198668    CALM1 ENST00000553964
## MTUS1-005   ENSG00000129422    MTUS1 ENST00000297488
## GDPD5-012   ENSG00000158555    GDPD5 ENST00000528031
## SLC16A3-018 ENSG00000141526  SLC16A3 ENST00000583025
## FLT1-001    ENSG00000102755     FLT1 ENST00000282397
## AZIN1-012   ENSG00000155096    AZIN1 ENST00000517581
## HBA2-002    ENSG00000188536     HBA2 ENST00000397806
## ITSN1-001   ENSG00000205726    ITSN1 ENST00000381318
## ADAM10-015  ENSG00000137845   ADAM10 ENST00000558733
## RPS24-001   ENSG00000138326    RPS24 ENST00000372360
## PSG5-005    ENSG00000204941     PSG5 ENST00000407568
## VMP1-004    ENSG00000062716     VMP1 ENST00000592790
## TFPI2-002   ENSG00000105825    TFPI2 ENST00000461482
## PSG9-008    ENSG00000183668     PSG9 ENST00000593948

And now we have the heatmap ready for visualisation

pheatmap(clusteredMatrix,
         color = plasma(84),
         border_color = NA,
         show_colnames = FALSE,
         cluster_cols = FALSE,
         main = paste0(
           "Top 15 transcripts with greatest\n expression",
           " differences to the gene level"
         ),
         fontsize = 10,
         gaps_col = 27,
         cutree_rows = 4,
         clustering_method = "ward.D2",
         annotation_col = ann_col)

Transcriptome profile

In this section I simply ask a few questions of the data. The purpose is so that I can offer a more in-depth understanding of what is within the data. This is kind of a data-driven approach, I’m not certain on what to find, and I am not testing anything. Just a bit of fun exploring to do

Analysis Result Info

How many significant genes across all analyses?

all_genes <- c(sig_tx$Gene, sig_genes$Gene, sig_dtu$Gene) %>%
  unique()

print(length(all_genes))
## [1] 1651

How many non coding in all 1642 genes?

grch38_gene_anno %>%
  dplyr::filter(gene_id %in% all_genes) %>%
  dplyr::count(gene_biotype, sort = TRUE) %>%
  print()
## # A tibble: 7 × 2
##   gene_biotype                           n
##   <chr>                              <int>
## 1 protein_coding                      1629
## 2 lncRNA                                10
## 3 transcribed_unprocessed_pseudogene     4
## 4 polymorphic_pseudogene                 3
## 5 IG_C_gene                              2
## 6 unprocessed_pseudogene                 2
## 7 transcribed_unitary_pseudogene         1

What are the lncRNAs?

lncRNAs <- grch38_gene_anno %>%
  dplyr::filter(gene_id %in% all_genes) %>%
  dplyr::filter(gene_biotype == "lncRNA")

sig_genes %>%
  dplyr::filter(Gene %in% lncRNAs$gene_id) %>%
  print()
## # A tibble: 9 × 11
##   Gene       GeneName length chromosome Gene_biotype logFC unshrunk.logFC logCPM
##   <chr>      <chr>     <dbl> <chr>      <chr>        <dbl>          <dbl>  <dbl>
## 1 ENSG00000… LINC028…   9435 7          lncRNA       -4.05          -4.06  2.47 
## 2 ENSG00000… MSC-AS1  290227 8          lncRNA        2.69           2.70  2.10 
## 3 ENSG00000… C1QTNF1…   9543 17         lncRNA       -1.64          -1.64  2.19 
## 4 ENSG00000… LINC011… 123865 2          lncRNA       -1.38          -1.38  4.67 
## 5 ENSG00000… AC07961…   7794 2          lncRNA        2.03           2.04  0.953
## 6 ENSG00000… AC08011…   4039 17         lncRNA       -1.02          -1.03  1.90 
## 7 ENSG00000… LINC015…  21889 5          lncRNA       -1.04          -1.04  1.05 
## 8 ENSG00000… C8orf31   21476 8          lncRNA       -1.14          -1.15  0.937
## 9 ENSG00000… MIRLET7…  60060 22         lncRNA        1.09           1.09  1.09 
## # ℹ 3 more variables: PValue <dbl>, FDR <dbl>, DE <lgl>
sig_tx %>%
  dplyr::filter(Gene %in% lncRNAs$gene_id) %>%
  print()
## # A tibble: 6 × 14
##   Transcript      Gene      GeneName  width length chromosome Transcript_biotype
##   <chr>           <chr>     <chr>     <dbl>  <dbl> <chr>      <chr>             
## 1 ENST00000457356 ENSG0000… MSC-AS1  212274    413 8          lncRNA            
## 2 ENST00000409974 ENSG0000… LINC028…   9400   2713 7          lncRNA            
## 3 ENST00000581579 ENSG0000… C1QTNF1…   9543   2093 17         lncRNA            
## 4 ENST00000409518 ENSG0000… LINC011…   5970    751 2          lncRNA            
## 5 ENST00000409912 ENSG0000… LINC011…   5973    718 2          lncRNA            
## 6 ENST00000360737 ENSG0000… MIRLET7…  27926    795 22         lncRNA            
## # ℹ 7 more variables: TranscriptName <chr>, logFC <dbl>, unshrunk.logFC <dbl>,
## #   logCPM <dbl>, PValue <dbl>, FDR <dbl>, DE <lgl>
sig_dtu %>%
  dplyr::filter(Gene %in% lncRNAs$gene_id) %>%
  print()
## # A tibble: 5 × 18
##   Gene         Transcript  geneFDR   txpFDR seqnames  start    end  width strand
##   <chr>        <chr>         <dbl>    <dbl> <chr>     <dbl>  <dbl>  <dbl> <chr> 
## 1 ENSG0000021… ENST00000… 3.32e-12 8.15e-14 2        2.40e8 2.40e8   7400 -     
## 2 ENSG0000021… ENST00000… 3.32e-12 5.93e-12 2        2.40e8 2.40e8   7400 -     
## 3 ENSG0000021… ENST00000… 3.32e-12 1.44e- 1 2        2.40e8 2.40e8   7400 -     
## 4 ENSG0000022… ENST00000… 5.29e- 3 0        2        4.67e7 4.68e7 123865 +     
## 5 ENSG0000022… ENST00000… 5.29e- 3 0        2        4.67e7 4.68e7 123865 +     
## # ℹ 9 more variables: gene_name <chr>, gene_biotype <chr>,
## #   seq_coord_system <chr>, description <chr>, gene_id_version <chr>,
## #   symbol <chr>, entrezid <lgl>, transcript_name <chr>, DTU <lgl>

What are all the significant transcripts between DTE and DTU?

all_tx <- c(sig_tx$Transcript, sig_dtu$Transcript) %>%
  unique()

print(length(all_tx))
## [1] 2364
all_tx_genes <- c(sig_tx$Gene, sig_dtu$Gene) %>%
  unique()

print(length(all_tx_genes))
## [1] 1401

How many isoforms did these genes have?

oxygenTranscripts %>% 
  dplyr::select("Gene",
                "Transcript") %>%
  rbind(oxygenDTU %>%
          dplyr::select("Gene",
                        "Transcript")) %>%
  dplyr::filter(Gene %in% all_tx_genes) %>%
  distinct() %>%
  dplyr::count(Gene, sort = TRUE) %>%
  dplyr::count(n, sort = TRUE, name = "Number of Genes") %>%
  dplyr::rename("Isoforms" = "n") %>%
  DT::datatable()

What types of transcripts were they?

grch38_txp_anno %>%
  dplyr::filter(transcript_id %in% all_tx) %>%
  dplyr::count(transcript_biotype, sort = TRUE) %>%
  DT::datatable()

Info on the triple overlap genes

triple_genes <- sig_dtu %>%
  dplyr::filter(Gene %in% sig_tx$Gene,
                Gene %in% sig_genes$Gene) %>%
  dplyr::select(Gene) %>%
  distinct()

What are the overlapped genes in DTE results?

oxygenTranscripts %>%
  dplyr::filter(Gene %in% triple_genes$Gene) %>%
  dplyr::select("Gene",
                "GeneName",
                "Transcript",
                "TranscriptName",
                "logFC",
                "FDR") %>%
  dplyr::count(GeneName, sort = TRUE) %>%
  DT::datatable()

What are the overlapped genes in DTU results?

oxygenDTU %>%
  dplyr::filter(Gene %in% triple_genes$Gene) %>%
  dplyr::select("Gene",
                "GeneName" = "gene_name",
                "Transcript",
                "TranscriptName" = "transcript_name",
                "geneFDR",
                "txpFDR") %>%
  dplyr::count(GeneName, sort = TRUE) %>%
  DT::datatable()

What are the significant overlapped genes in DTE results?

oxygenTranscripts %>%
  dplyr::filter(Gene %in% triple_genes$Gene) %>%
  dplyr::select("Gene",
                "GeneName",
                "Transcript",
                "TranscriptName",
                "logFC",
                "FDR") %>%
  dplyr::filter(FDR < 0.05 & abs(logFC) > 1) %>%
  dplyr::count(GeneName, sort = TRUE) %>%
  DT::datatable()

What are the significant overlapped genes in DTU results?

oxygenDTU %>%
  dplyr::filter(Gene %in% triple_genes$Gene) %>%
  dplyr::select("Gene",
                "GeneName" = "gene_name",
                "Transcript",
                "TranscriptName" = "transcript_name",
                "geneFDR",
                "txpFDR") %>%
  dplyr::filter(geneFDR < 0.05 & txpFDR < 0.05) %>%
  dplyr::count(GeneName, sort = TRUE) %>%
  dplyr::filter(n > 1) %>%
  DT::datatable()

The same but which ones appear only once?

oxygenDTU %>%
  dplyr::filter(Gene %in% triple_genes$Gene) %>%
  dplyr::select("Gene",
                "GeneName" = "gene_name",
                "Transcript",
                "TranscriptName" = "transcript_name",
                "geneFDR",
                "txpFDR") %>%
  dplyr::filter(geneFDR < 0.05 & txpFDR < 0.05) %>%
  dplyr::count(GeneName, sort = TRUE) %>%
  dplyr::filter(n == 1) %>%
  DT::datatable()

Transcript specific information

What are their biotypes?

oxt_tx_triple <- oxygenTranscripts %>%
  dplyr::filter(Gene %in% triple_genes$Gene)

oxy_dtu_triple <- oxygenDTU %>% 
  dplyr::filter(Gene %in% triple_genes$Gene)

Check to see we have complete representation of all transcripts here and then count the transcript biotypes

all(oxy_dtu_triple$Transcript %in% oxt_tx_triple$Transcript)
## [1] TRUE
oxt_tx_triple %>%
  dplyr::count(Transcript_biotype) %>%
  DT::datatable()

What transcripts have retained introns?

oxt_tx_triple %>% 
  dplyr::filter(Transcript_biotype == "retained_intron") %>%
  DT::datatable()

What transcripts have untranslated transcripts?

oxt_tx_triple %>%
  dplyr::filter(Transcript_biotype == "processed_transcript") %>%
  DT::datatable()

Which transcripts are targeted by nonsense mediated decay?

oxt_tx_triple %>%
  dplyr::filter(Transcript_biotype == "nonsense_mediated_decay") %>%
  DT::datatable()

Isoform information in genes

How many isoforms transcribed by each gene?

sig_tx %>%
  dplyr::count(Gene, sort = TRUE) %>%
  DT::datatable()
sig_tx %>% 
  dplyr::count(Gene,
               sort = TRUE, 
               name = "isoforms") %>%
  dplyr::count(isoforms,
               sort = TRUE) %>%
  DT::datatable()

Which genes had the most amount of significantly DE isoforms?

sig_tx %>%
  dplyr::count(Gene, sort = TRUE) %>%
  left_join(sig_tx %>%
              dplyr::select("Gene", "GeneName") %>%
              distinct(),
            by = "Gene") %>%
  DT::datatable()

Heatmap transcripts information

oxygenTranscripts %>%
  dplyr::filter(TranscriptName %in% rownames(clusteredMatrix)) %>%
  DT::datatable()